*** LIS Cross-section Data center in Luxembourg

* email: usersupport@lisdatacenter.org 

*** LIS Self Teaching Package 2022

*** Part I: Inequality, poverty, and social policy
*** Stata version

* last change of this version of the syntax: 15-01-2022.

/* The exercises in Part I demonstrate the use of household income data along with 
useful programming techniques for working with the LIS data. With a focus on 
descriptive statistics, the exercises will lead you through the process of developing
a complete program that examines inequality and poverty across countries.*/


** Exercise 1: Accessing the LIS databases

* Open the household file referenced by the global gt06h and
* describe the household income variable dhi
use $gt06h, clear
summarize dhi


** Exercise 2:	Sample selection and weighting

* Income variables that must all be non-missing for a household to be kept
local components dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc

* Load only the variables needed for this exercise
use `components' hpopwgt nhhmem grossnet using $gt06h, clear

* Flag households with a missing value in at least one income variable
generate miss_comp = 0
replace miss_comp = 1 if dhi==. | hifactor==. | hpub_i==. | hpub_u==. | hpub_a==. | hiprivate==. | hxitsc==.
tabulate miss_comp

* Unweighted description of the weight and the income variables
summarize hpopwgt `components', detail
tabulate grossnet

* Same description weighted by hpopwgt*nhhmem, before and after
* dropping the flagged households
summarize hpopwgt `components' [w=hpopwgt*nhhmem], detail
drop if miss_comp == 1
summarize hpopwgt `components' [w=hpopwgt*nhhmem], detail

* Reload the complete file to tabulate the currency variable
use $gt06h, clear
tabulate currency


** Exercise 3: Working with household income variables (top and bottom coding and equivalence scales)

* Income variables that must all be non-missing for a household to be kept
local components dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc
use `components' hpopwgt nhhmem grossnet using $gt06h, clear

* Flag, then silently remove, households missing any of these variables
generate miss_comp = 0
foreach comp of local components {
	quietly replace miss_comp = 1 if `comp' == .
}
quietly drop if miss_comp == 1

* Keep only records with dhi filled
drop if dhi == .

* Start the top/bottom-coded series from dhi, recoding negatives to zero
capture drop dhi_tb
generate dhi_tb = dhi
replace dhi_tb = 0 if dhi < 0

* Outlier detection works on the log scale
generate dhi_log = log(dhi_tb)
* log(0) is missing: set it to 0 so zero (and former negative) incomes
* stay in the distribution of non-missing dhi
replace dhi_log = 0 if dhi_log == . & dhi_tb != .

* Weighted quartiles of log income give the interquartile range
quietly summarize dhi_log [w=hpopwgt], detail
generate iqr = r(p75) - r(p25)
* Bounds for extreme values: 3 interquartile ranges beyond the quartiles
generate upper_bound = r(p75) + (iqr * 3)
generate lower_bound = r(p25) - (iqr * 3)

* Top code, then bottom code, income at the (exponentiated) bounds
replace dhi_tb = exp(upper_bound) if dhi_tb > exp(upper_bound)
replace dhi_tb = exp(lower_bound) if dhi_tb < exp(lower_bound)

* LIS equivalence scale: divide by the square root of household size
capture drop edhi_tb
generate edhi_tb = dhi_tb/(nhhmem^0.5)

* Per capita income from the top and bottom coded series
generate pcdhi_tb = dhi_tb/nhhmem

summarize dhi dhi_tb [w=hpopwgt], detail
summarize pcdhi_tb edhi_tb [w=hpopwgt*nhhmem], detail


** Exercise 4: Inequality: The Gini Index

* Income variables that must all be non-missing for a household to be kept
local components dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc
use `components' hpopwgt nhhmem grossnet using $gt06h, clear

* Flag, then silently remove, households missing any of these variables
generate miss_comp = 0
foreach comp of local components {
	quietly replace miss_comp = 1 if `comp' == .
}
quietly drop if miss_comp == 1

* Keep only records with dhi filled
drop if dhi == .

* Start the top/bottom-coded series from dhi, recoding negatives to zero
capture drop dhi_tb
generate dhi_tb = dhi
replace dhi_tb = 0 if dhi < 0

* Outlier detection works on the log scale
generate dhi_log = log(dhi_tb)
* log(0) is missing: set it to 0 so zero (and former negative) incomes
* stay in the distribution of non-missing dhi
replace dhi_log = 0 if dhi_log == . & dhi_tb != .

* Weighted quartiles of log income give the interquartile range
quietly summarize dhi_log [w=hpopwgt], detail
generate iqr = r(p75) - r(p25)
* Bounds for extreme values: 3 interquartile ranges beyond the quartiles
generate upper_bound = r(p75) + (iqr * 3)
generate lower_bound = r(p25) - (iqr * 3)

* Top code, then bottom code, income at the (exponentiated) bounds
replace dhi_tb = exp(upper_bound) if dhi_tb > exp(upper_bound)
replace dhi_tb = exp(lower_bound) if dhi_tb < exp(lower_bound)

* LIS equivalence scale: divide by the square root of household size
capture drop edhi_tb
generate edhi_tb = dhi_tb/(nhhmem^0.5)

* Per capita income from the top and bottom coded series
generate pcdhi_tb = dhi_tb/nhhmem

* Gini coefficient for the three versions of household income:
* unadjusted, per capita, and equivalised
ineqdec0 dhi_tb [w=hpopwgt]
ineqdec0 pcdhi_tb [w=hpopwgt*nhhmem]
ineqdec0 edhi_tb [w=hpopwgt*nhhmem]


** Exercise 5: Relative poverty rates

* Income variables that must all be non-missing for a household to be kept
local components dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc
use `components' hpopwgt nhhmem grossnet using $gt06h, clear

* Flag, then silently remove, households missing any of these variables
generate miss_comp = 0
foreach comp of local components {
	quietly replace miss_comp = 1 if `comp' == .
}
quietly drop if miss_comp == 1

* Keep only records with dhi filled
drop if dhi == .

* Start the top/bottom-coded series from dhi, recoding negatives to zero
capture drop dhi_tb
generate dhi_tb = dhi
replace dhi_tb = 0 if dhi < 0

* Outlier detection works on the log scale
generate dhi_log = log(dhi_tb)
* log(0) is missing: set it to 0 so zero (and former negative) incomes
* stay in the distribution of non-missing dhi
replace dhi_log = 0 if dhi_log == . & dhi_tb != .

* Weighted quartiles of log income give the interquartile range
quietly summarize dhi_log [w=hpopwgt], detail
generate iqr = r(p75) - r(p25)
* Bounds for extreme values: 3 interquartile ranges beyond the quartiles
generate upper_bound = r(p75) + (iqr * 3)
generate lower_bound = r(p25) - (iqr * 3)

* Top code, then bottom code, income at the (exponentiated) bounds
replace dhi_tb = exp(upper_bound) if dhi_tb > exp(upper_bound)
replace dhi_tb = exp(lower_bound) if dhi_tb < exp(lower_bound)

* LIS equivalence scale: divide by the square root of household size
capture drop edhi_tb
generate edhi_tb = dhi_tb/(nhhmem^0.5)

* Poverty line: 50% of the median equivalised income, the median being
* weighted by hpopwgt*nhhmem; r(p50) is read straight from this summarize
quietly summarize edhi_tb [w=hpopwgt*nhhmem], detail
generate byte poor = (edhi_tb < r(p50)*0.5)

* Proportion of households in poverty
summarize poor [w=hpopwgt]

* Proportion of individuals in poverty
summarize poor [w=hpopwgt*nhhmem]


** Exercise 6: Comparing income concepts

* Variables that must all be non-missing for a household to be kept
local components dhi hifactor hi33 hpub_i hpub_u hpub_a hiprivate hxitsc
* Income concepts compared in this exercise
local concepts mi siti sa dhi

use dhi hifactor hi33 hpublic hpub_i hpub_u hpub_a hiprivate hxitsc hpopwgt nhhmem grossnet using $gt06h, clear

* Flag, then silently remove, households missing any required variable
generate miss_comp = 0
foreach comp of local components {
	quietly replace miss_comp = 1 if `comp' == .
}
quietly drop if miss_comp == 1
summarize hpublic hpub_i hpub_u hpub_a

* Build the three derived income concepts from their components
generate mi = hifactor + hiprivate + hi33
generate siti = hifactor + hiprivate + hi33 + hpub_i + hpub_u - hxitsc
generate sa = hifactor + hiprivate + hi33 + hpub_a

* For every concept: recode negatives to zero, top/bottom code on the
* log scale, then equivalise by the square root of household size
foreach inc of local concepts {
	generate e`inc'_b = `inc'
	replace e`inc'_b = 0 if `inc' < 0

	* Log income; log(0) is missing, so set it to 0 to keep zero and
	* former negative values in the distribution
	generate e`inc'_log = log(e`inc'_b)
	replace e`inc'_log = 0 if e`inc'_log == . & e`inc'_b != .

	* The helper variables are recreated for each concept
	capture drop iqr
	capture drop upper_bound
	capture drop lower_bound

	* Weighted quartiles of log income give the interquartile range and
	* the bounds for extreme values (3 IQR beyond the quartiles)
	quietly summarize e`inc'_log [w=hpopwgt], detail
	generate iqr = r(p75) - r(p25)
	generate upper_bound = r(p75) + (iqr * 3)
	generate lower_bound = r(p25) - (iqr * 3)

	* Top code, bottom code, then apply the equivalence scale
	replace e`inc'_b = exp(upper_bound) if e`inc'_b > exp(upper_bound)
	replace e`inc'_b = exp(lower_bound) if e`inc'_b < exp(lower_bound)
	replace e`inc'_b = (e`inc'_b/(nhhmem^0.5))
}

* One common poverty line for all concepts: half the weighted median of
* equivalised dhi
quietly summarize edhi_b [w=hpopwgt*nhhmem], detail
global povline = r(p50)*0.5

* Inequality measures and poverty rate for each income concept
foreach inc of local concepts {
	quietly generate byte poor`inc' = (e`inc'_b<$povline)
	ineqdec0 e`inc'_b [w=hpopwgt*nhhmem]
	summarize poor`inc' [w=hpopwgt*nhhmem]
}


** Exercise 7: Comparing multiple countries

* make_variables: prepares the income concepts for the dataset in memory.
*   Expects: dhi hifactor hi33 hpub_i hpub_u hpub_a hiprivate hxitsc hwgt nhhmem
*   Creates: miss_comp, mi, siti, sa, e<concept>_log and the top/bottom
*            coded, equivalised e<concept>_b for mi, siti, sa and dhi
*   Sets:    global povline = half the weighted median of edhi_b
* Dropping any earlier definition first lets this do-file be re-run in the
* same Stata session without "program already defined" (r(110)).
capture program drop make_variables
program define make_variables
	* Remove households with a missing value in any required variable
	gen miss_comp = 0
	quietly replace miss_comp=1 if dhi==. | hifactor==. | hi33==. | hpub_i==. | hpub_u==. | hpub_a==. | hiprivate==. | hxitsc==.
	quietly drop if miss_comp==1
	sum dhi [w=hwgt], de

	* Derived income concepts
	gen mi = hifactor + hiprivate + hi33
	gen siti = hifactor + hiprivate + hi33 + hpub_i + hpub_u - hxitsc
	gen sa = hifactor + hiprivate + hi33 + hpub_a

	foreach var in mi siti sa dhi {
		* Recode negative values into zero
		cap drop e`var'_b
		gen e`var'_b = `var'
		replace e`var'_b = 0 if `var'<0
		* Apply top and bottom codes / outlier detection on the log scale
		gen e`var'_log=log(e`var'_b)
		* log(0) is missing: set to 0 to keep negatives and 0 in the
		* overall distribution of non-missing values
		replace e`var'_log=0 if e`var'_log==. & e`var'_b!=.
		* The helper variables are recreated for each concept
		cap drop iqr
		cap drop upper_bound
		cap drop lower_bound
		* Detect interquartile range
		qui sum e`var'_log [w=hwgt],de
		gen iqr=r(p75)-r(p25)
		* Bounds for extreme values: 3 IQR beyond the quartiles
		gen upper_bound=r(p75) + (iqr * 3)
		gen lower_bound=r(p25) - (iqr * 3)
		* Top code income at upper bound for extreme values
		replace e`var'_b=exp(upper_bound) if e`var'_b>exp(upper_bound)
		* Bottom code income at lower bound for extreme values
		replace e`var'_b=exp(lower_bound) if e`var'_b<exp(lower_bound)
		* Equivalise by the square root of household size
		replace e`var'_b = (e`var'_b/(nhhmem^0.5))
	}

	* Poverty line: 50% of the weighted median of equivalised dhi
	quietly sum edhi_b [w=hwgt*nhhmem], de
	global povline = r(p50)*0.5
end

* Repeat the Exercise 6 analysis for each dataset in the list
foreach ds in gt06 us04 dk04 hu05 il05 {
	display "`ds'"
	use dhi hifactor hi33 hpublic hpub_a hpub_i hpub_u hiprivate hxitsc hwgt nhhmem grossnet using $`ds'h, clear
	summarize hpublic hpub_i hpub_u hpub_a
	tabulate grossnet

	* Build the equivalised income concepts and the global povline
	quietly make_variables

	* Inequality measures and poverty rate for each income concept
	foreach inc in mi siti sa dhi {
		quietly generate byte poor`inc' = (e`inc'_b<$povline)
		ineqdec0 e`inc'_b [w=hwgt*nhhmem]
		summarize poor`inc' [w=hwgt*nhhmem]
	}
}


** Exercise 8: Producing compact and concise output

* make_variables: prepares the income concepts for the dataset in memory.
*   Expects: dhi hifactor hi33 hpub_i hpub_u hpub_a hiprivate hxitsc hwgt nhhmem
*   Creates: miss_comp, mi, siti, sa, e<concept>_log and the top/bottom
*            coded, equivalised e<concept>_b for mi, siti, sa and dhi
*   Sets:    global povline = half the weighted median of edhi_b
* "capture" keeps the drop from aborting the do-file when the program has
* not been defined yet (e.g. this exercise is run on its own).
capture program drop make_variables
program define make_variables
	* Remove households with a missing value in any required variable
	gen miss_comp = 0
	quietly replace miss_comp=1 if dhi==. | hifactor==. | hi33==. | hpub_i==. | hpub_u==. | hpub_a==. | hiprivate==. | hxitsc==.
	quietly drop if miss_comp==1
	sum dhi [w=hwgt], de

	* Derived income concepts
	gen mi = hifactor + hiprivate + hi33
	gen siti = hifactor + hiprivate + hi33 + hpub_i + hpub_u - hxitsc
	gen sa = hifactor + hiprivate + hi33 + hpub_a

	foreach var in mi siti sa dhi {
		* Recode negative values into zero
		cap drop e`var'_b
		gen e`var'_b = `var'
		replace e`var'_b = 0 if `var'<0
		* Apply top and bottom codes / outlier detection on the log scale
		gen e`var'_log=log(e`var'_b)
		* log(0) is missing: set to 0 to keep negatives and 0 in the
		* overall distribution of non-missing values
		replace e`var'_log=0 if e`var'_log==. & e`var'_b!=.
		* The helper variables are recreated for each concept
		cap drop iqr
		cap drop upper_bound
		cap drop lower_bound
		* Detect interquartile range
		qui sum e`var'_log [w=hwgt],de
		gen iqr=r(p75)-r(p25)
		* Bounds for extreme values: 3 IQR beyond the quartiles
		gen upper_bound=r(p75) + (iqr * 3)
		gen lower_bound=r(p25) - (iqr * 3)
		* Top code income at upper bound for extreme values
		replace e`var'_b=exp(upper_bound) if e`var'_b>exp(upper_bound)
		* Bottom code income at lower bound for extreme values
		replace e`var'_b=exp(lower_bound) if e`var'_b<exp(lower_bound)
		* Equivalise by the square root of household size
		replace e`var'_b = (e`var'_b/(nhhmem^0.5))
	}

	* Poverty line: 50% of the weighted median of equivalised dhi
	quietly sum edhi_b [w=hwgt*nhhmem], de
	global povline = r(p50)*0.5
end

* Print one comma separated row of Gini coefficients and poverty rates per
* dataset. The header row is printed before the first dataset processed,
* whichever dataset that is, so the list below can be edited or reordered
* without losing the header.
local first_row 1
foreach ccyy in gt06 us04 dk04 hu05 il05 {
	quietly use dhi hifactor   hi33 hpublic hpub_a hpub_i hpub_u hiprivate hxitsc hwgt nhhmem grossnet using $`ccyy'h, clear
	* Build the equivalised income concepts and the global povline
	quietly make_variables

	foreach var in mi siti sa dhi {
		quietly gen byte poor`var'=(e`var'_b<$povline)

		* Calculate and store gini, relative poverty rate (in percent)
		quietly ineqdec0 e`var'_b [w=hwgt*nhhmem]
		local gini`var' : di %9.3f  r(gini)
		quietly sum poor`var' [w=hwgt*nhhmem]
		local povrate`var' : di %9.2f r(mean)*100
	}

	/* Output gini and poverty rate measures as comma separated values.
	If this is the first country being computed, output a line of column
	headers first */
	if `first_row' {
		di "dataset,gini_mi,gini_siti,gini_sa,gini_dhi,povrate_mi,povrate_siti,povrate_sa,povrate_dhi"
		local first_row 0
	}
	di "`ccyy',`ginimi',`ginisiti',`ginisa',`ginidhi',`povratemi',`povratesiti',`povratesa',`povratedhi'"
}


** Exercise 9: Producing graphs

* Datasets to pool for the graph, and the household-level variables to load
* from each of them
global datasets "gt06 us04 dk04"
global varshh "dname dhi hwgt nhhmem"

* make_data: stacks the variables in $varshh from every dataset listed in
* $datasets into one file saved as ${mydata}exercise-part1.
* The first dataset of the list starts the file; every later one is loaded
* and the file built so far is appended to it before saving again. The
* first dataset is detected by position, not by name, so $datasets may
* start with any dataset.
* Dropping any earlier definition first lets this do-file be re-run in the
* same Stata session.
capture program drop make_data
program define make_data
	local is_first 1
	foreach ccyy in $datasets {
		use $varshh using $`ccyy'h, clear
		if !`is_first' {
			append using ${mydata}exercise-part1
		}
		save ${mydata}exercise-part1, replace
		local is_first 0
	}
end

* make_variables (pooled-file version): prepares equivalised, top and bottom
* coded dhi for a file that stacks several datasets.
*   Expects: dname dhi hwgt nhhmem
*   Creates: ctry (numeric encoding of dname), miss_comp, dhi_tb, dhi_log,
*            iqr, upper_bound, lower_bound, edhi_tb
* The top/bottom-coding bounds are computed separately for each value of
* ctry: the pooled file mixes datasets whose incomes are on different
* scales, so quartiles of the pooled distribution would not give meaningful
* bounds for any single dataset. This matches the per-dataset treatment in
* the earlier exercises.
* "capture" keeps the drop from aborting the do-file when the program has
* not been defined yet (e.g. this exercise is run on its own).
capture program drop make_variables
program define make_variables
	encode dname, gen (ctry)

	* Select only records if dhi filled
	gen miss_comp = 0
	quietly replace miss_comp=1 if dhi==.
	quietly drop if miss_comp==1

	* Recode negative dhi into zero
	cap drop dhi_tb
	gen dhi_tb=dhi
	replace dhi_tb=0 if dhi<0

	* Apply top and bottom codes / outlier detection on the log scale
	gen dhi_log=log(dhi_tb)
	* log(0) is missing: set to 0 to keep negatives and 0 in the overall
	* distribution of non-missing dhi
	replace dhi_log=0 if dhi_log==. & dhi_tb!=.

	* Interquartile range and bounds for extreme values, per dataset
	quietly gen iqr=.
	quietly gen upper_bound=.
	quietly gen lower_bound=.
	quietly levelsof ctry, local(ctries)
	foreach c of local ctries {
		qui sum dhi_log [w=hwgt] if ctry==`c', de
		quietly replace iqr=r(p75)-r(p25) if ctry==`c'
		quietly replace upper_bound=r(p75) + (iqr * 3) if ctry==`c'
		quietly replace lower_bound=r(p25) - (iqr * 3) if ctry==`c'
	}

	* Top code income at upper bound for extreme values
	replace dhi_tb=exp(upper_bound) if dhi_tb>exp(upper_bound)
	* Bottom code income at lower bound for extreme values
	replace dhi_tb=exp(lower_bound) if dhi_tb<exp(lower_bound)

	* Equivalise by the square root of household size
	cap drop edhi_tb
	gen edhi_tb = dhi_tb/(nhhmem^0.5)
end

* Graph for Lorenz Curve
* Build the pooled file, then reload it
quietly make_data
use dname dhi hwgt nhhmem using ${mydata}exercise-part1, clear

* Create ctry and the equivalised, top and bottom coded income edhi_tb
quietly make_variables

* Estimate the Lorenz curves by ctry and draw them overlaid in one graph
lorenz estimate edhi_tb [w=hwgt*nhhmem], over(ctry)
lorenz graph, aspectratio(1) xlabel(, grid) overlay

* Export the graph under $mypdf
graphexportpdf $mypdf/graph-lorenz

